The main aim of this activity is to create a dataset of synonyms and antonyms of English words.
All the data belongs to its original creator at https://www.thesaurus.com/
Note:
The synonyms and antonyms of a word should be placed in the same row as a single entry, with words separated by ‘;’.
library(rvest)
package ‘rvest’ was built under R version 3.5.3
Loading required package: xml2
package ‘xml2’ was built under R version 3.5.3
library(stringr)
package ‘stringr’ was built under R version 3.5.3
I created this dataset, which lists how many web pages are available for the words starting with each particular letter.
# Load the per-letter page table and show it for inspection.
# The scraping loop reads column 2 as a URL prefix and column 3 as a page count.
thesaurus_data <- read.csv(
  file = "~/R/Thesaurus_web_scraping/thesaurus_letter_page_number.csv"
)
print(thesaurus_data)

# Number of rows of the page table (one per letter) that will be visited.
total_letters <- 26

# Zero-row word list that the scraping loop appends to.
word_data <- data.frame(word = character(0))
The following loop requests each link and collects all the words for all 26 letters.
This took me about 6 minutes 30 seconds.
# Collect the word list from every index page of every letter.
#
# For each of the first `total_letters` rows of `thesaurus_data`, column 2 is
# used as a URL prefix and column 3 as the number of pages; the page number is
# appended to the prefix to build each page URL. The text of every node
# matching the CSS class `.e1j8zk4s1` is stored as one row.
#
# Per-page results are gathered in a list and bound once at the end, so the
# growing table is not copied on every page. seq_len() is used so that a count
# of 0 yields no iterations (1:0 would request pages 1 and 0).
letter_pages <- list()
for (i in seq_len(total_letters)) {
  base_url <- as.character(thesaurus_data[i, 2])
  letter_total_page <- thesaurus_data[i, 3]
  for (j in seq_len(letter_total_page)) {
    letter_url <- paste0(base_url, j)
    letter_webpage <- read_html(letter_url)
    letter_html_words <- html_nodes(letter_webpage, '.e1j8zk4s1')
    letter_page_words <- html_text(letter_html_words)
    # The variable name is kept because as.data.frame() derives the column
    # name ("letter_page_words") from it.
    letter_pages[[length(letter_pages) + 1]] <-
      as.data.frame(letter_page_words, row.names = NULL)
  }
}
# do.call() on an empty list gives NULL, which rbind() ignores, so
# `word_data` is left untouched when no page was visited.
word_data <- rbind(word_data, do.call(rbind, letter_pages))
All the words that we got till now are:
# Number of rows (scraped entries) currently held in `word_data`.
nrow(word_data)
[1] 133322
# Preview the first rows of the collected word list.
head(word_data)

# Prefix of every per-word page; the encoded word is appended to it.
browse_url <- "https://www.thesaurus.com/browse/"

# Number of collected words, used as the upper bound of the scraping loop.
total_word <- nrow(word_data)

# Zero-row result table that the scraping loop appends to.
thesaurus <- data.frame(
  word = character(0),
  browser_word = character(0),
  url = character(0),
  synoyms = character(0),
  antonyms = character(0)
)
The scraping took around 52 hours 8 minutes to complete.
# Scrape the synonyms and antonyms of every word from `start_index` onwards.
#
# For each word the page `browse_url` + encoded word is fetched, and the child
# nodes of the first element matching the synonym / antonym CSS selector are
# collapsed into a single string with toString() (", " separated).
# NOTE(review): the note at the top of this file asks for ';' as the
# separator; the ", " produced by toString() is kept here so existing output
# does not change -- confirm which one is wanted.
#
# Results are written into preallocated vectors and bound to `thesaurus` once
# after the loop, so the table is not copied on every word. If the loop is
# interrupted, the rows gathered so far are still held in the `scraped_*`
# vectors. A page that cannot be fetched is reported with message() and kept
# as a row with NA synonyms/antonyms, so one bad word does not abort the run.

# Text of the children of the first node of `webpage` that matches `css`.
extract_word_list <- function(webpage, css) {
  matched_nodes <- html_nodes(webpage, css)
  html_text(html_children(matched_nodes[1]))
}

# Index of the first word to scrape; the value is the resume point the loop
# was last run with. Set it to 1 to scrape the whole word list.
start_index <- 131903

# Guard the range: start_index:total_word would count DOWN if total_word
# were smaller than start_index.
if (start_index <= total_word) {
  word_indices <- start_index:total_word
} else {
  word_indices <- integer(0)
}

n_words <- length(word_indices)
scraped_word <- character(n_words)
scraped_browser_word <- character(n_words)
scraped_url <- character(n_words)
scraped_synoyms <- rep(NA_character_, n_words)
scraped_antonyms <- rep(NA_character_, n_words)

for (k in seq_along(word_indices)) {
  i <- word_indices[k]
  # as.character() so a factor column is handled as plain text.
  actual_word <- as.character(word_data[i, 1])
  current_word <- str_replace_all(actual_word, " ", "%20")
  current_word <- str_replace_all(current_word, "'", "%27")
  current_url <- paste0(browse_url, current_word)

  scraped_word[k] <- actual_word
  scraped_browser_word[k] <- current_word
  scraped_url[k] <- current_url

  word_webpage <- tryCatch(
    read_html(current_url),
    error = function(e) {
      message(
        "failed number: ", i, " word : ", current_word,
        " (", conditionMessage(e), ")"
      )
      NULL
    }
  )
  if (is.null(word_webpage)) {
    next
  }

  # Get all synonym words
  syn_words <- extract_word_list(
    word_webpage, '.en1b8750+ .e1qo4u830 .et6tpn80'
  )
  # Get all antonym words
  ant_words <- extract_word_list(
    word_webpage, '.em66cyi0+ .e1qo4u830 .et6tpn80'
  )

  scraped_synoyms[k] <- toString(syn_words)
  scraped_antonyms[k] <- toString(ant_words)
  print(paste0("passed number: ", i, " word : ", current_word))
}

# Column names match the ones the per-word rows have always carried
# (actual_word, current_word, current_url, synoyms, antonyms), so the new
# rows also bind onto a `thesaurus` filled by an earlier run.
thesaurus <- rbind(
  thesaurus,
  data.frame(
    actual_word = scraped_word,
    current_word = scraped_browser_word,
    current_url = scraped_url,
    synoyms = scraped_synoyms,
    antonyms = scraped_antonyms
  )
)
head(thesaurus)